In this notebook, we will perform EDA on the data using techniques taught in class. We will primarily be doing semantic analysis (Topic Modelling), Syntactic Analysis (NER-Separate Notebook), Sentiment Analysis and Statistical Analysis. For full elaboration on the results, please refer to the report.
import pandas as pd
import numpy as np
import os
import nltk
import gc
import warnings
import re
import sys
import csv
import pickle
from time import ctime, time
from collections import defaultdict
import pyLDAvis
import pyLDAvis.gensim # don't skip this
import seaborn as sns
%matplotlib inline
import gensim
from gensim import corpora
from gensim.utils import simple_preprocess
from gensim.models import LsiModel,LdaModel,TfidfModel, HdpModel
from gensim.models.hdpmodel import HdpModel
from gensim.models.ldamulticore import LdaMulticore
from gensim.models import Phrases
from gensim.models.phrases import Phraser
from gensim.models.coherencemodel import CoherenceModel
import nltk
from nltk.tokenize import RegexpTokenizer,sent_tokenize,word_tokenize,TweetTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer,HashingVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_is_fitted
from textblob import TextBlob
from wordcloud import WordCloud
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import warnings
warnings.filterwarnings("ignore")
# --- Load data and inspect overall class balance ------------------------------
# Reads two preprocessed versions of the training set from the working
# directory. `data` is used for most of the EDA below; `data_v2` only for a
# secondary per-class dictionary.
#project_dir = r"C:\Users\User\Desktop\JeremyWork\BT4222\"
data = pd.read_csv("cleaned_train.csv")
data_v2 = pd.read_csv("sanitize1_train.csv")
data.head()
#check for missing values
data['comment_text'].isnull().sum()
# Fill na with empty string
data['comment_text'].fillna("", inplace=True)
# class_count = number of toxicity labels a comment carries.
# NOTE(review): assumes the binary label columns start at column index 2 —
# confirm against the CSV schema.
data['class_count'] = data.iloc[:,2:].sum(axis = 1, skipna = True)
multi_class=data.iloc[:,2:].sum(axis = 1, skipna = True).value_counts()
# "normal" = comments with zero labels; "anomalous" = one or more labels.
normal = multi_class[0]
anomalous = sum(multi_class[1:])
plt.figure(figsize=(8,4))
plt.title("Count per class")
plt.ylabel('Count of Occurrences', fontsize=10)
plt.xlabel('Class', fontsize=12)
ax = sns.barplot(["normal","toxic"],np.array([normal,anomalous]))
# --- Per-label counts, multi-tag distribution, label correlation ---------------
# iloc[:, 2:-1] excludes the id/text columns and the class_count helper column
# added above, leaving only the binary toxicity label columns.
x=data.iloc[:,2:-1].sum()
plt.figure(figsize=(8,4))
ax= sns.barplot(x.index, x.values)
plt.title("Count per class", fontsize=12)
plt.ylabel('Count of Occurrences', fontsize=12)
plt.xlabel('Toxic Type', fontsize=12)
sns.set(font_scale=1)
# How many labels a single comment carries (0, 1, 2, ... tags).
multi_class=data.iloc[:,2:-1].sum(axis = 1, skipna = True).value_counts()
plt.figure(figsize=(8,4))
ax = sns.barplot(multi_class.index, multi_class.values,color='grey')
plt.title("Multiple tags per Comment")
plt.ylabel('# of Occurrences', fontsize=12)
plt.xlabel('# of tags ', fontsize=12)
# Pairwise correlation between the binary label columns, shown as a heatmap.
main_data=data.iloc[:,2:-1]
# filter temp by removing clean comments
# temp_df=temp_df[~train.clean]
corr=main_data.corr()
plt.figure(figsize=(10,8))
sns.heatmap(corr,xticklabels=corr.columns.values,yticklabels=corr.columns.values, annot=True,cmap='Blues')
# --- Word cloud for "normal" (label-free) comments and per-class subsets -------
normal_text = data[data['class_count'] == 0]
# NOTE(review): `stop_words` is only defined further down in this notebook
# (topic-modelling section); this cell relies on that cell having been run
# first — confirm execution order.
new_stopwords = stop_words + ["one","two","three","four","five","six","seven","eight","nine"]
# list_sent = [" ".join(text) for text in text_data]
# all_words = ' '.join([text for text in list_sent])
all_words = ' '.join([text for text in normal_text["comment_text"]])
wordcloud = WordCloud(max_words=2000,width=800, height=500, random_state=21, max_font_size=110,stopwords=new_stopwords,collocations=False).generate(all_words)
plt.figure(figsize=(20, 10))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.show()
# data_dict maps class name -> DataFrame of comments carrying that label;
# 'normal' holds the unlabeled comments. Rows can appear under several keys
# since comments may carry multiple labels.
data_dict = {}
data_dict['normal'] = normal_text
for i in data.columns[2:-1]:
    data_dict[i] = data[data[i] == 1]
# df_toxic = data[data.toxic == 1]
# df_severe_toxic = data[data.severe_toxic == 1]
# df_obscene = data[data.obscene == 1]
# df_threat = data[data.threat == 1]
# df_insult = data[data.insult == 1]
# df_identity_hate = data[data.identity_hate == 1]
# Same per-class split on the alternative ("sanitized") dataset.
data_dict_v2 = {}
for i in data.columns[2:-1]:
    data_dict_v2[i] = data_v2[data_v2[i] == 1]
# Display the class keys (notebook cell output).
[i for i in data_dict]
# --- VADER sentiment scores per class -----------------------------------------
sid = SentimentIntensityAnalyzer()
sentiment_dict = {}
for i in data_dict:
    class_sentiment = []
    curr_text_data = data_dict[i]['comment_text'].tolist()
    for j in curr_text_data:
        try:
            # polarity_scores returns {'neg', 'neu', 'pos', 'compound'}.
            class_sentiment.append(sid.polarity_scores(j))
        except:
            # NOTE(review): bare except silently maps any failure (e.g. a
            # non-string entry) to the scalar 0, which produces a NaN row
            # when the list is turned into a DataFrame below.
            print(i)
            class_sentiment.append(0)
    sentiment_dict[i] = class_sentiment
import pickle
project_dir = r"C:\Users\User\Desktop\BT4222\New folder"
#pickle.dump(sentiment_dict,open(os.path.join(project_dir,'sentiment_dict.p'),'wb'))
# NOTE(review): this load REPLACES the sentiment_dict just computed above with
# a previously cached copy — presumably to skip the slow recomputation.
sentiment_dict = pickle.load(open(os.path.join(project_dir,'sentiment_dict.p'),"rb"))
for i in sentiment_dict:
    curr_sentiments = pd.DataFrame(sentiment_dict[i])
    # Attach the raw comment text as the first column, followed by the four
    # VADER score columns.
    curr_class = curr_sentiments.assign(Text = data_dict[i]['comment_text'].tolist()).iloc[:,[-1, 0,1,2,3]]
    sentiment_dict[i] = curr_class
sentiment_dict['toxic']
# --- Distribution of the VADER compound score for each class -------------------
fig = plt.figure(constrained_layout=False, figsize=(20,100))
spec = fig.add_gridspec(24, 3, wspace=0.3, hspace=0.5)
sns.set(font_scale=1.5)
for i,c in enumerate(sentiment_dict):
    axes = fig.add_subplot(spec[i])
    # Green for the unlabeled "normal" comments, red for every toxic class.
    if c == "normal":
        sns.distplot(sentiment_dict[c]['compound'],color="g")
    else:
        sns.distplot(sentiment_dict[c]['compound'],color="r")
    # NOTE(review): axes was already added via add_subplot(spec[i]); re-adding
    # it here looks redundant — confirm it has no visual effect before removing.
    fig.add_subplot(axes)
    axes.title.set_text(c)
fig.show()
# plt.figure(figsize=(6, 6))
# sns.set(font_scale=1)
# sns.distplot(sentiment_dict['toxic']['compound'])
We will look at the common topics talked about in each type of toxic comment.
prepare_corpus_part1 - Lowercases and tokenizes the text, removes stop words, and generates bigrams and trigrams.
prepare_corpus_part2 - POS-tags each token and keeps only nouns, verbs, and adjectives; this combination was chosen after trying various sets of tags.
prepare_corpus_part3 - Generates the dictionary and corpus needed for the modelling. Preprocessing is also done on the dictionary, such as removing extremes (terms that occur too rarely or too frequently).
# NLTK English stop words, extended with HTML residue ("br"), the stray token
# "a", and the spelled-out digits, none of which carry topical signal.
stop_words = stopwords.words('english') + [
    "br", "a", "zero", 'one', 'two', 'three', 'four',
    'five', 'six', 'seven', 'eight', 'nine',
]
def prepare_corpus_part1(df, bigram_min, trigram_min):
    """Lowercase and tokenize comments, then drop stop words.

    Also fits bigram/trigram phrase models on the corpus; as in the original
    cell, the phrasers are fitted but NOT applied (the application lines are
    kept commented out), so the plain stop-word-filtered tokens are returned.

    NOTE: mutates `df` by filling NaN comment_text with ''.
    """
    df["comment_text"] = df["comment_text"].replace(np.nan, '', regex=True)
    documents = [
        simple_preprocess(raw, deacc=True)
        for raw in df['comment_text'].tolist()
    ]
    documents = [
        [word for word in doc if word not in stop_words]
        for doc in documents
    ]
    bigram_counts = Phrases(documents, min_count=bigram_min)
    trigram_counts = Phrases(bigram_counts[documents], min_count=trigram_min)
    bigram_mod = Phraser(bigram_counts)
    trigram_mod = Phraser(trigram_counts)
    # text_data = [trigram_mod[bigram_mod[text]] for text in text_data]
    # text_data = [bigram_mod[text] for text in text_data]
    return documents
def prepare_corpus_part2(text_data, tags):
    """POS-tag each tokenized document and keep only tokens with a wanted tag.

    Parameters
    ----------
    text_data : list of tokenized documents (list[list[str]]).
    tags : container of Penn-Treebank POS tags to retain
           (e.g. noun/verb/adjective tags).

    Returns
    -------
    list[list[tuple[str, str]]]
        Per document, the surviving (token, tag) pairs.
    """
    result = []
    size = len(text_data)
    # enumerate replaces the original hand-rolled counter variable.
    for tracker, tokenized_text in enumerate(text_data, start=1):
        # Progress indicator: POS tagging a large corpus is slow.
        if tracker % 1000 == 0:
            print(f"Currently processing sentence {tracker}/{size}")
        tokens_tagged = nltk.pos_tag(tokenized_text)
        result.append([tok for tok in tokens_tagged if tok[1] in tags])
    return result
from nltk.corpus import wordnet
def get_wordnet_pos(treebank_tag):
    """Map a Penn-Treebank POS tag to the WordNet POS constant expected by
    WordNetLemmatizer.lemmatize().

    Tags starting with 'J' -> adjective, 'V' -> verb, 'N' -> noun.
    Any other tag falls back to NOUN: the original returned '', which makes
    lemmatize() raise KeyError if such a tag ever slips past the tag filter
    in prepare_corpus_part2.
    """
    prefix_to_pos = {'J': wordnet.ADJ, 'V': wordnet.VERB, 'N': wordnet.NOUN}
    # [:1] is safe on an empty string; default mirrors lemmatize()'s own 'n'.
    return prefix_to_pos.get(treebank_tag[:1], wordnet.NOUN)
def prepare_corpus_part3(text_data, ext_below, ext_above):
    """Lemmatize (token, tag) pairs and build the gensim dictionary/corpus.

    Lemmas shorter than 3 characters are discarded. Dictionary extremes are
    filtered: words in fewer than `ext_below` documents, or in more than
    `ext_above` (a fraction) of documents, are removed.

    Parameters
    ----------
    text_data : list[list[tuple[str, str]]] from prepare_corpus_part2.
    ext_below : int, minimum document frequency.
    ext_above : float, maximum document-frequency fraction.

    Returns
    -------
    (dictionary, corpus, text_data) where corpus is the bag-of-words form.
    """
    lemmatizer = WordNetLemmatizer()
    lemmatized_docs = []
    for doc in text_data:
        # Lemmatize each token exactly once; the original called lemmatize()
        # twice per token (once in the filter, once for the value).
        lemmas = [lemmatizer.lemmatize(tok, get_wordnet_pos(tag)) for tok, tag in doc]
        lemmatized_docs.append([lm for lm in lemmas if len(lm) >= 3])
    dictionary = corpora.Dictionary(lemmatized_docs)
    dictionary.filter_extremes(no_below=ext_below, no_above=ext_above)
    dictionary.compactify()
    corpus = [dictionary.doc2bow(doc) for doc in lemmatized_docs]
    return dictionary, corpus, lemmatized_docs
# compare different models and plot
def compute_coherence_values(dictionary, corpus, text_data, stop, start, step, iterations, passes):
    """Train an LDA model for each topic count in range(start, stop, step),
    record its c_v coherence, and plot coherence vs. number of topics.

    Parameters
    ----------
    dictionary, corpus, text_data : outputs of prepare_corpus_part3.
    stop, start, step : the range of topic counts to sweep.
    iterations, passes : LDA training budget per model.

    Returns
    -------
    (model_list, coherence_values), both aligned with range(start, stop, step).
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, stop, step):
        print('Start modeling topic {} at {}'.format(num_topics, ctime()))
        model = LdaModel(corpus, num_topics=num_topics, alpha='auto',
                         id2word=dictionary, random_state=4222,
                         iterations=iterations, passes=passes)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=text_data,
                                        corpus=corpus, dictionary=dictionary,
                                        coherence='c_v')
        # Compute coherence once: get_coherence() re-runs the (expensive)
        # estimation on every call, and the original called it twice.
        coherence = coherencemodel.get_coherence()
        coherence_values.append(coherence)
        print('Model of {} topics has the coherence {}'.format(num_topics, coherence))
    x = range(start, stop, step)
    plt.plot(x, coherence_values)
    plt.xlabel("Number of Topics")
    plt.ylabel("Coherence score")
    # Fixed: the original passed a bare string, which matplotlib iterates
    # character by character; a list yields the single intended label.
    plt.legend(["coherence_values"], loc='best')
    plt.show()
    return model_list, coherence_values
def format_unique_topics(topics):
    """De-duplicate LDA topics by their word-distribution string and re-index.

    `topics` is a sequence of (topic_id, topic_string) pairs, as produced by
    LdaModel.print_topics(). Returns [[new_index, topic_string], ...] with
    duplicate strings removed, preserving first-seen order. Also prints the
    number of unique topics found.
    """
    seen = []
    for entry in topics:
        description = entry[1]
        if description not in seen:
            seen.append(description)
    print("Number of unique topics:", len(seen))
    return [[position, text] for position, text in enumerate(seen)]
def convert_to_table(topics):
    """Arrange formatted topics into a DataFrame, one column per topic.

    Each entry of `topics` is [index, topic_string], where the topic string
    contains its words in double quotes (LdaModel.print_topics format).

    Returns a DataFrame with columns "Topic 1..k" and rows "word 1..n".
    The row count is derived from the extracted words rather than hard-coded
    at 20, so print_topics(num_words=...) values other than 20 also work
    (the original raised on any other word count).
    """
    words_per_topic = [re.findall(r'"(.*?)"', topic[1]) for topic in topics]
    df = pd.DataFrame.from_records(words_per_topic)
    df = df.transpose()
    df.columns = ["Topic " + str(i) for i in range(1, len(topics) + 1)]
    df.index = ["word " + str(i) for i in range(1, len(df.index) + 1)]
    return df
def popular_ngrams(df, n, top):
    """Plot the `top` most frequent word n-grams in df['comment_text'].

    Lowercases the text, removes stop words, Porter-stems each token, then
    counts n-grams with CountVectorizer and draws a horizontal bar plot.

    NOTE: mutates `df` by adding/overwriting a 'cleaned_text' column
    (matching the original behavior).
    """
    df["cleaned_text"] = df['comment_text'].str.lower()
    df["cleaned_text"] = df["cleaned_text"].replace(np.nan, '', regex=True)
    df["cleaned_text"] = df["cleaned_text"].apply(
        lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))
    ps = PorterStemmer()
    df["cleaned_text"] = df["cleaned_text"].apply(
        lambda x: ' '.join([ps.stem(w) for w in x.split()]))
    word_vectorizer = CountVectorizer(ngram_range=(n, n), analyzer='word')
    sparse_matrix = word_vectorizer.fit_transform(df["cleaned_text"])
    # Column-wise sum of the document-term matrix = corpus-wide frequencies.
    frequencies = sum(sparse_matrix).toarray()[0]
    # get_feature_names() was removed in scikit-learn 1.2; prefer the new
    # name when available so the cell runs on both old and new versions.
    if hasattr(word_vectorizer, "get_feature_names_out"):
        feature_names = word_vectorizer.get_feature_names_out()
    else:
        feature_names = word_vectorizer.get_feature_names()
    results = pd.DataFrame(frequencies, index=feature_names, columns=['frequency'])
    results.sort_values(by=['frequency'], axis=0, ascending=False, inplace=True)
    plt.figure(figsize=(16, 16))
    sns.set(font_scale=2)
    ax = sns.barplot(x=results["frequency"][0:top], y=results.index.values[0:top])
    ax.set_title(f"Top {top} Most Frequent ngrams (removed stopword and performed stemming)")
# ==============================================================================
# Topic modelling pipeline, run once per class. This block: 'normal' comments,
# plus n-grams and a label-overlap breakdown for 'toxic'.
# ==============================================================================
# Penn-Treebank tags kept for topic modelling: adjectives, nouns, verbs.
tags = ['JJ','JJR','JJS','NN','NNS','NNP','NNPS','VB','VBD','VBG','VBN','VBP','VBZ']
#tags = ['NN','NNS','NNP','NNPS','VB','VBD','VBG','VBN','VBP','VBZ']
preprocess_text1 = prepare_corpus_part1(data_dict['normal'],20,10)
preprocess_text2 = prepare_corpus_part2(preprocess_text1,tags)
dictionary, corpus, text_data = prepare_corpus_part3(preprocess_text2,5,0.6)
# Sweep topic counts 5..100 in steps of 5 (100 iterations, 10 passes each).
model_list_clean, coherence_values_clean = compute_coherence_values(dictionary, corpus, text_data, 101, 5, 5,100,10)
# NOTE(review): range(2,11,1) does not match the 5..100-step-5 sweep above, so
# the num_topics labels in this table are wrong (zip silently truncates) —
# confirm the intended sweep.
df_coherence_clean = pd.DataFrame(list(zip(list(range(2,11,1)),coherence_values_clean)),columns=('num_topics','coherence_value'))
best_index_clean = coherence_values_clean.index(max(coherence_values_clean))
best_model_clean = model_list_clean[best_index_clean]
# Topics taken from model_list_clean[1] (manually chosen), not best_model_clean.
dfclean = convert_to_table(format_unique_topics(model_list_clean[1].print_topics(num_topics=100, num_words=20)))
dfclean
popular_ngrams(data_dict['toxic'],2,20)
popular_ngrams(data_dict['toxic'],1,20)
# Break Down of toxic category and overlaps with the other labels.
data_toxic = {}
for i in data.columns[2:]:
    data_toxic[i] = data_dict['toxic'][data_dict['toxic'][i] == 1]
# NOTE(review): plot_dist is not defined anywhere in this file — presumably
# defined in another notebook cell; confirm before running.
plot_dist(data_toxic)
# --- Topic modelling for the 'toxic' class -------------------------------------
tags = ['JJ','JJR','JJS','NN','NNS','NNP','NNPS','VB','VBD','VBG','VBN','VBP','VBZ']
#tags = ['NN','NNS','NNP','NNPS','VB','VBD','VBG','VBN','VBP','VBZ']
preprocess_text1 = prepare_corpus_part1(data_dict['toxic'],20,10)
preprocess_text2 = prepare_corpus_part2(preprocess_text1,tags)
dictionary, corpus, text_data = prepare_corpus_part3(preprocess_text2,5,0.6)
# Word cloud built from the fully preprocessed (tokenized/lemmatized) text.
# new_stopwords = stop_words + ["one","two","three","four","five","six","seven","eight","nine"]
list_sent = [" ".join(text) for text in text_data]
all_words = ' '.join([text for text in list_sent])
#all_words = ' '.join([text for text in data_dict['toxic']["comment_text"]])
wordcloud = WordCloud(max_words=2000,width=800, height=500, random_state=21, max_font_size=110,stopwords=stop_words,collocations=False).generate(all_words)
plt.figure(figsize=(20, 10))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.show()
# Sweep topic counts 2..10; range(2,11,1) matches this sweep.
model_list_toxic, coherence_values_toxic = compute_coherence_values(dictionary, corpus, text_data, 11, 2, 1,100,10)
df_coherence_toxic = pd.DataFrame(list(zip(list(range(2,11,1)),coherence_values_toxic)),columns=('num_topics','coherence_value'))
best_index_toxic = coherence_values_toxic.index(max(coherence_values_toxic))
best_model_toxic = model_list_toxic[best_index_toxic]
# Topics taken from model_list_toxic[4] (6 topics; manually chosen).
dftoxic = convert_to_table(format_unique_topics(model_list_toxic[4].print_topics(num_topics=100, num_words=20)))
dftoxic
# --- n-grams, breakdown, and topic modelling for 'severe_toxic' ----------------
popular_ngrams(data_dict['severe_toxic'],1,20)
popular_ngrams(data_dict['severe_toxic'],2,20)
# Break Down: overlap of severe_toxic with every other label.
data_stoxic = {}
for i in data.columns[2:]:
    data_stoxic[i] = data_dict['severe_toxic'][data_dict['severe_toxic'][i] == 1]
# NOTE(review): plot_dist is not defined in this file; presumably another cell.
plot_dist(data_stoxic)
tags = ['JJ','JJR','JJS','NN','NNS','NNP','NNPS','VB','VBD','VBG','VBN','VBP','VBZ']
#tags = ['NN','NNS','NNP','NNPS','VB','VBD','VBG','VBN','VBP','VBZ']
preprocess_text1 = prepare_corpus_part1(data_dict['severe_toxic'],20,10)
preprocess_text2 = prepare_corpus_part2(preprocess_text1,tags)
dictionary, corpus, text_data = prepare_corpus_part3(preprocess_text2,5,0.6)
# Word cloud built from the preprocessed tokens.
# new_stopwords = stop_words + ["one","two","three","four","five","six","seven","eight","nine"]
list_sent = [" ".join(text) for text in text_data]
all_words = ' '.join([text for text in list_sent])
#all_words = ' '.join([text for text in data_dict['toxic']["comment_text"]])
wordcloud = WordCloud(max_words=2000,width=800, height=500, random_state=21, max_font_size=110,stopwords=stop_words,collocations=False).generate(all_words)
plt.figure(figsize=(20, 10))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.show()
# Sweep topic counts 2..10.
model_list_stoxic, coherence_values_stoxic = compute_coherence_values(dictionary, corpus, text_data, 11, 2, 1,100,10)
df_coherence_stoxic = pd.DataFrame(list(zip(list(range(2,11,1)),coherence_values_stoxic)),columns=('num_topics','coherence_value'))
best_index_stoxic = coherence_values_stoxic.index(max(coherence_values_stoxic))
best_model_stoxic = model_list_stoxic[best_index_stoxic]
# Topics taken from model_list_stoxic[0] (2 topics; manually chosen).
dfstoxic = convert_to_table(format_unique_topics(model_list_stoxic[0].print_topics(num_topics=100, num_words=20)))
dfstoxic
# --- n-grams, breakdown, and topic modelling for 'obscene' ---------------------
popular_ngrams(data_dict['obscene'],1,20)
popular_ngrams(data_dict['obscene'],2,20)
# Break Down: overlap of obscene with every other label.
data_obscene = {}
for i in data.columns[2:]:
    data_obscene[i] = data_dict['obscene'][data_dict['obscene'][i] == 1]
# NOTE(review): plot_dist is not defined in this file; presumably another cell.
plot_dist(data_obscene)
tags = ['JJ','JJR','JJS','NN','NNS','NNP','NNPS','VB','VBD','VBG','VBN','VBP','VBZ']
#tags = ['NN','NNS','NNP','NNPS','VB','VBD','VBG','VBN','VBP','VBZ']
preprocess_text1 = prepare_corpus_part1(data_dict['obscene'],20,10)
preprocess_text2 = prepare_corpus_part2(preprocess_text1,tags)
# NOTE(review): no_above is 0.5 here vs 0.6 in every other class — confirm
# whether this difference is intentional.
dictionary, corpus, text_data = prepare_corpus_part3(preprocess_text2,5,0.5)
new_stopwords = stop_words + ["one","two","three","four","five","six","seven","eight","nine"]
list_sent = [" ".join(text) for text in text_data]
all_words = ' '.join([text for text in list_sent])
#all_words = ' '.join([text for text in data_dict['toxic']["comment_text"]])
wordcloud = WordCloud(max_words=2000,width=800, height=500, random_state=21, max_font_size=110,stopwords=stop_words,collocations=False).generate(all_words)
plt.figure(figsize=(12, 8))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.show()
# Sweep topic counts 2..10.
model_list_obscene, coherence_values_obscene = compute_coherence_values(dictionary, corpus, text_data, 11, 2, 1,100,10)
df_coherence_obscene = pd.DataFrame(list(zip(list(range(2,11,1)),coherence_values_obscene)),columns=('num_topics','coherence_value'))
best_index_obscene = coherence_values_obscene.index(max(coherence_values_obscene))
best_model_obscene = model_list_obscene[best_index_obscene]
# Topics taken from model_list_obscene[0] (2 topics; manually chosen).
dfobscene = convert_to_table(format_unique_topics(model_list_obscene[0].print_topics(num_topics=100, num_words=20)))
dfobscene
# --- n-grams, breakdown, and topic modelling for 'threat' ----------------------
popular_ngrams(data_dict['threat'],1,20)
popular_ngrams(data_dict['threat'],2,20)
# Break Down: overlap of threat with every other label.
data_threat = {}
for i in data.columns[2:]:
    data_threat[i] = data_dict['threat'][data_dict['threat'][i] == 1]
# NOTE(review): plot_dist is not defined in this file; presumably another cell.
plot_dist(data_threat)
tags = ['JJ','JJR','JJS','NN','NNS','NNP','NNPS','VB','VBD','VBG','VBN','VBP','VBZ']
#tags = ['NN','NNS','NNP','NNPS','VB','VBD','VBG','VBN','VBP','VBZ']
preprocess_text1 = prepare_corpus_part1(data_dict['threat'],20,10)
preprocess_text2 = prepare_corpus_part2(preprocess_text1,tags)
dictionary, corpus, text_data = prepare_corpus_part3(preprocess_text2,5,0.6)
new_stopwords = stop_words + ["one","two","three","four","five","six","seven","eight","nine"]
list_sent = [" ".join(text) for text in text_data]
all_words = ' '.join([text for text in list_sent])
#all_words = ' '.join([text for text in data_dict['toxic']["comment_text"]])
wordcloud = WordCloud(max_words=2000,width=800, height=500, random_state=21, max_font_size=110,stopwords=stop_words,collocations=False).generate(all_words)
plt.figure(figsize=(12, 8))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.show()
# Sweep topic counts 2..10.
model_list_threat, coherence_values_threat = compute_coherence_values(dictionary, corpus, text_data, 11, 2, 1,100,10)
df_coherence_threat = pd.DataFrame(list(zip(list(range(2,11,1)),coherence_values_threat)),columns=('num_topics','coherence_value'))
best_index_threat = coherence_values_threat.index(max(coherence_values_threat))
best_model_threat = model_list_threat[best_index_threat]
# Topics taken from model_list_threat[2] (4 topics; manually chosen).
dfthreat = convert_to_table(format_unique_topics(model_list_threat[2].print_topics(num_topics=100, num_words=20)))
dfthreat
# --- n-grams, breakdown, and topic modelling for 'insult' ----------------------
popular_ngrams(data_dict['insult'],1,20)
popular_ngrams(data_dict['insult'],2,20)
# Break Down: overlap of insult with every other label.
data_insult = {}
for i in data.columns[2:]:
    data_insult[i] = data_dict['insult'][data_dict['insult'][i] == 1]
# NOTE(review): plot_dist is not defined in this file; presumably another cell.
plot_dist(data_insult)
tags = ['JJ','JJR','JJS','NN','NNS','NNP','NNPS','VB','VBD','VBG','VBN','VBP','VBZ']
#tags = ['NN','NNS','NNP','NNPS','VB','VBD','VBG','VBN','VBP','VBZ']
preprocess_text1 = prepare_corpus_part1(data_dict['insult'],20,10)
preprocess_text2 = prepare_corpus_part2(preprocess_text1,tags)
dictionary, corpus, text_data = prepare_corpus_part3(preprocess_text2,5,0.6)
new_stopwords = stop_words + ["one","two","three","four","five","six","seven","eight","nine"]
list_sent = [" ".join(text) for text in text_data]
all_words = ' '.join([text for text in list_sent])
#all_words = ' '.join([text for text in data_dict['toxic']["comment_text"]])
wordcloud = WordCloud(max_words=2000,width=800, height=500, random_state=21, max_font_size=110,stopwords=stop_words,collocations=False).generate(all_words)
plt.figure(figsize=(12, 8))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.show()
# Sweep topic counts 2..20 in steps of 2 (unlike the other classes' 2..10).
model_list_insult, coherence_values_insult = compute_coherence_values(dictionary, corpus, text_data, 21, 2, 2,100,10)
# NOTE(review): range(2,11,1) does not match the 2..20-step-2 sweep above, so
# the num_topics labels in this table are wrong — confirm the intended sweep.
df_coherence_insult = pd.DataFrame(list(zip(list(range(2,11,1)),coherence_values_insult)),columns=('num_topics','coherence_value'))
best_index_insult = coherence_values_insult.index(max(coherence_values_insult))
best_model_insult = model_list_insult[best_index_insult]
# Unlike the other classes, the best-coherence model is used directly here.
dfinsult = convert_to_table(format_unique_topics(best_model_insult.print_topics(num_topics=10, num_words=20)))
dfinsult
# --- n-grams, breakdown, and topic modelling for 'identity_hate' ---------------
popular_ngrams(data_dict['identity_hate'],1,20)
popular_ngrams(data_dict['identity_hate'],2,20)
# Break Down: overlap of identity_hate with every other label.
data_hate = {}
for i in data.columns[2:]:
    data_hate[i] = data_dict['identity_hate'][data_dict['identity_hate'][i] == 1]
# NOTE(review): plot_dist is not defined in this file; presumably another cell.
plot_dist(data_hate)
tags = ['JJ','JJR','JJS','NN','NNS','NNP','NNPS','VB','VBD','VBG','VBN','VBP','VBZ']
#tags = ['NN','NNS','NNP','NNPS','VB','VBD','VBG','VBN','VBP','VBZ']
preprocess_text1 = prepare_corpus_part1(data_dict['identity_hate'],20,10)
preprocess_text2 = prepare_corpus_part2(preprocess_text1,tags)
dictionary, corpus, text_data = prepare_corpus_part3(preprocess_text2,5,0.6)
new_stopwords = stop_words + ["one","two","three","four","five","six","seven","eight","nine"]
list_sent = [" ".join(text) for text in text_data]
all_words = ' '.join([text for text in list_sent])
#all_words = ' '.join([text for text in data_dict['toxic']["comment_text"]])
wordcloud = WordCloud(max_words=2000,width=800, height=500, random_state=21, max_font_size=110,stopwords=stop_words,collocations=False).generate(all_words)
plt.figure(figsize=(12, 8))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.show()
# Sweep topic counts 2..10.
model_list_hate, coherence_values_hate = compute_coherence_values(dictionary, corpus, text_data, 11, 2, 1,100,10)
df_coherence_hate = pd.DataFrame(list(zip(list(range(2,11,1)),coherence_values_hate)),columns=('num_topics','coherence_value'))
best_index_hate = coherence_values_hate.index(max(coherence_values_hate))
best_model_hate = model_list_hate[best_index_hate]
# Topics taken from model_list_hate[0] (2 topics; manually chosen).
dfhate = convert_to_table(format_unique_topics(model_list_hate[0].print_topics(num_topics=100, num_words=20)))
dfhate